# Snakefile

# Snakefile to quality check and estimate counts from RNA-Seq data on a reference genome (e.g. human genome hg19)
# The rule 'all' specifies which files to generate

import os
import subprocess

#########################
## Pipeline configuration
#########################

# Configuration file: every setting used below is looked up in config.yaml
configfile: "config.yaml"

# Working directory (intermediate files, to clean afterwards) and directory
# receiving the final results.
# NOTE(review): both values are prepended to relative paths by plain string
# concatenation throughout this file, so they presumably end with "/" --
# confirm in config.yaml.
WORKING_DIR, RESULT_DIR = config["workdir"], config["resultdir"]

# Number of CPUs (threads) passed to the command-line tools
THREADS = config["threads"]

# Trimmomatic: adapter file and path to the jar file
ADAPTERFILE, TRIMMOMATIC = (
    config["trimmomatic"]["adapters"],
    config["trimmomatic"]["jarfile"],
)

##################
## Desired outputs
##################
# FastQC reports on the trimmed reads, one per sample and per mate.
# NOTE(review): this list is not requested by the rule 'all' as written in
# this file -- confirm whether that is intentional.
FASTQC_REPORTS = expand(
    "fastqc/trimmed/{sample}/{pair}/{sample}_{pair}_fastqc.html",
    sample=config["samples"],
    pair=["forward", "reverse"],
)

# STAR final mapping log of every sample, copied to the result directory
MAPPING_LOGS = expand(
    RESULT_DIR + "mappinglogs/{sample}_Log.final.out",
    sample=config["samples"],
)

# Count tables (genes and repeats)
COUNTS = [
    RESULT_DIR + "counts/gene_counts.txt",
    RESULT_DIR + "counts/repeat_counts.txt",
]

# One bigWig per sample, read uniqueness class and strand
BIGWIGS = expand(
    RESULT_DIR + "bigwig/{sample}_Signal.{uniqueness}.{strand}.normalized.bw",
    sample=config["samples"],
    uniqueness=["Unique", "UniqueMultiple"],
    strand=["str1", "str2"],
)

# Copies of the files that define this run, stored next to the results
MASTER_FILES = [
    RESULT_DIR + "Snakefile",
    RESULT_DIR + "config.yaml",
    RESULT_DIR + "environment.yaml",
]

# Default target: requests the mapping logs, count tables, bigWig tracks and
# the copies of the master files.
rule all:
    message: "RNA-Seq pipeline has been successfully run"
    input:
        MAPPING_LOGS,
        COUNTS,
        BIGWIGS,
        MASTER_FILES

#######################
##  copy master files
#######################
# Copy the Snakefile, its configuration and the environment definition from
# the current directory into the result directory.
# The rule declares no input, so the sources are not tracked by Snakemake.
rule copy_master_files:
    message:"copying master files (Snakefile and configuration files)"
    output:
        RESULT_DIR + "Snakefile",
        RESULT_DIR + "config.yaml",
        RESULT_DIR + "environment.yaml"
    shell:
        "cp Snakefile {RESULT_DIR};"
        "cp config.yaml {RESULT_DIR};"
        "cp environment.yaml {RESULT_DIR}"

#########
## bigWig
#########
# Convert one sorted bedGraph (sample x uniqueness x strand) to bigWig using
# the chromosome sizes table produced by generate_chrom_sizes.
rule convert_to_bigWig:
    message:"converting strand-specific bedGraph files for {wildcards.sample}, {wildcards.uniqueness} reads and {wildcards.strand} strand to bigWig format"
    input:
        bedgraph = WORKING_DIR + "mapped/{sample}_Signal.{uniqueness}.{strand}.out.sorted.bg",
        chroms = "chromSizes.tab"
    output:
        RESULT_DIR + "bigwig/{sample}_Signal.{uniqueness}.{strand}.normalized.bw"
    shell:
        "bedGraphToBigWig {input.bedgraph} {input.chroms} {output}"

# Write chromSizes.tab (in the current directory) from the 2bit genome file
# named in the configuration.
rule generate_chrom_sizes:
    message:"generating chromosome sizes"
    input:
        genome2bit = config["refseqs"]["genome2bit"]
    output:
        "chromSizes.tab"
    shell:
        "twoBitInfo {input} {output}"

# Sort a bedGraph written by the STAR mapping rule with bedSort; the sorted
# file is the input of convert_to_bigWig.
rule sort_bedgraph:
    message:"sorting {wildcards.sample} bedGraph file for strand {wildcards.strand} and {wildcards.uniqueness} reads"
    input:
        WORKING_DIR + "mapped/{sample}_Signal.{uniqueness}.{strand}.out.bg"
    output:
        WORKING_DIR + "mapped/{sample}_Signal.{uniqueness}.{strand}.out.sorted.bg"
    shell:
        "bedSort {input} {output}"

################################################
## Count read alignments per feature (e.g. gene)
################################################
# Summarize read counts over the repeat annotation (GTF) with featureCounts.
# All sample BAM files are counted in a single call and written to one table.
rule count_repeats:
    message:"summarizing read counts for repeats"
    input:
        bams = expand(WORKING_DIR + "mapped/{sample}_Aligned.sortedByCoord.out.bam",sample=config["samples"].keys()),
        annotation = config["annotations"]["gtf4repeats"],
        genome = config["refseqs"]["genomefasta"]
    output:
        RESULT_DIR + "counts/repeat_counts.txt"
    shell:
        "featureCounts "
        "-f "                       # count at feature level instead of meta-feature level
        "-B "                       # count only fragments with both ends aligned
        "-p "                       # paired-end data: fragments are counted instead of reads
        "-s 2 "                     # 0 = unstranded (default), 1 = stranded, 2 = reversely stranded
        # disabled option: "-t exon " (feature type to count reads on)
        "-T {THREADS} "             # number of threads
        "-G {input.genome} "        # reference genome sequence
        "-a {input.annotation} "    # annotation file
        "-F GTF "                   # annotation format: SAF or GTF (GTF by default)
        "-o {output} "
        "{input.bams}"              # several BAM files can be processed at once

# Summarize read counts per gene with featureCounts, using the annotation
# configured under annotations/refseqFromSubread (declared as SAF below).
# All sample BAM files are counted in a single call and written to one table.
rule count_genes:
    message:"summarizing read counts for genes"
    input:
        bams = expand(WORKING_DIR + "mapped/{sample}_Aligned.sortedByCoord.out.bam",sample=config["samples"].keys()),
        annotation = config["annotations"]["refseqFromSubread"],
        genome = config["refseqs"]["genomefasta"]
    output:
        RESULT_DIR + "counts/gene_counts.txt"
    shell:
        "featureCounts "
        "-B "                       # count only fragments with both ends aligned
        "-p "                       # paired-end data: fragments are counted instead of reads
        "-s 2 "                     # 0 = unstranded (default), 1 = stranded, 2 = reversely stranded
        # NOTE(review): -t and -g select GTF feature/attribute types while the
        # annotation format below is SAF -- confirm they have any effect here.
        "-t exon "                  # feature type used for counting (e.g. exon)
        "-g gene_id "               # attribute used to group features into meta-features (e.g. gene)
        "-T {THREADS} "             # number of threads
        "-G {input.genome} "        # reference genome sequence
        "-a {input.annotation} "    # annotation file
        "-F SAF "                   # annotation format: SAF or GTF (GTF by default)
        "-o {output} "
        "{input.bams}"              # several BAM files can be processed at once
        


#############################
# Genome alignment using STAR
#############################
# Copy the STAR final log of a sample into the result directory.
# The log is copied, not moved: the source file is a declared output of
# map_to_genome_using_STAR, and deleting it would leave that rule with a
# missing output, so Snakemake would re-run the alignment on the next
# invocation.
rule copy_mapping_logs:
    input:
        WORKING_DIR + "mapped/{sample}_Log.final.out"
    output:
        RESULT_DIR + "mappinglogs/{sample}_Log.final.out"
    message:"copying mapping log for {wildcards.sample} to {output}"
    shell:
        "cp {input} {output}"

# Align the trimmed read pairs of one sample against the 2nd-pass STAR index,
# then index the resulting BAM with samtools.
# Outputs: the BAM and its index, the STAR final log and the four
# strand-specific signal bedGraph files (Unique / UniqueMultiple x str1 / str2).
# NOTE(review): the declared output names presumably require
# star/samtype = "BAM SortedByCoordinate" and star/outwigtype = "bedGraph" in
# config.yaml -- confirm.
# The read inputs are named fwd/rev instead of forward/reverse because
# "reverse" collides with the list method of the same name, which newer
# Snakemake releases reject as a reserved name.
rule map_to_genome_using_STAR:
    input:
        ref = [WORKING_DIR + "star2pass/" + f for f in ["chrLength.txt","chrNameLength.txt","chrName.txt","chrStart.txt","Genome","genomeParameters.txt","SA","SAindex"]],
        fwd = WORKING_DIR + "trimmed/{sample}_forward.fastq",
        rev = WORKING_DIR + "trimmed/{sample}_reverse.fastq"
    output:
        WORKING_DIR + "mapped/{sample}_Aligned.sortedByCoord.out.bam",
        WORKING_DIR + "mapped/{sample}_Aligned.sortedByCoord.out.bam.bai",
        WORKING_DIR + "mapped/{sample}_Log.final.out",
        WORKING_DIR + "mapped/{sample}_Signal.Unique.str1.out.bg",
        WORKING_DIR + "mapped/{sample}_Signal.Unique.str2.out.bg",
        WORKING_DIR + "mapped/{sample}_Signal.UniqueMultiple.str1.out.bg",
        WORKING_DIR + "mapped/{sample}_Signal.UniqueMultiple.str2.out.bg"
    message:"mapping the {wildcards.sample} reads to genome"
    params:
        # prefix prepended by STAR to every file it writes for this sample
        prefix = WORKING_DIR + "mapped/{sample}_",
        maxmismatches = config["star"]["mismatches"],
        unmapped = config["star"]["unmapped"],
        multimappers = config["star"]["multimappers"],
        matchNminoverLread = config["star"]["matchminoverlengthread"],
        outSamType = config["star"]["samtype"],
        outWigType = config["star"]["outwigtype"],
        outWigStrand = config["star"]["outwigstrand"],
        outWigNorm = config["star"]["outwignorm"],
        genomedir = WORKING_DIR + "star2pass/"
    shell:
        "STAR --genomeDir {params.genomedir} "
        "--readFilesIn {input.fwd} {input.rev} "
        "--outFilterMultimapNmax {params.multimappers} "
        "--outFilterMismatchNmax {params.maxmismatches} "
        "--outFilterMatchNminOverLread {params.matchNminoverLread} "
        "--alignEndsType EndToEnd "
        "--runThreadN {THREADS} "
        "--outReadsUnmapped {params.unmapped} "
        "--outFileNamePrefix {params.prefix} "
        "--outSAMtype {params.outSamType} "
        "--outWigType {params.outWigType} "
        "--outWigStrand {params.outWigStrand} "
        "--outWigNorm {params.outWigNorm};"
        # output[0] is the coordinate-sorted BAM; this creates the .bai output
        "samtools index {output[0]}"

#####################################################################
## STAR 2-pass: genome indexing + splice junctions database generation 
#####################################################################
# STAR 2nd pass: rebuild the genome index, adding the splice junctions
# collected from all samples during the 1st pass to the GTF annotation.
rule star2pass_index:
    message: "STAR 2nd pass: generating genome index"
    input:
        sjdb = WORKING_DIR + "star1pass/SJ.concatenated.out.tab",
        ref = config["refseqs"]["genomefasta"],
        gtf = config["annotations"]["gtf4genes"]
    output:
        STAR_2PASS = [WORKING_DIR + "star2pass/" + f for f in ["chrLength.txt","chrNameLength.txt","chrName.txt","chrStart.txt","Genome","genomeParameters.txt","SA","SAindex"]]
    params:
        # directory holding the index files listed in the output
        WORKING_DIR + "star2pass/"
    shell:
        "STAR --runMode genomeGenerate "
        "--genomeDir {params} "
        "--genomeFastaFiles {input.ref} "
        "--runThreadN {THREADS} "
        "--sjdbFileChrStartEnd {input.sjdb} "
        "--sjdbOverhang 99 "
        "--sjdbGTFfile {input.gtf};"
        # update the timestamps of all declared index files once STAR is done
        "touch -h {output}"

# Concatenate the 1st-pass splice junction tables of all samples into the
# single file used to build the 2nd-pass index.
# The output is written with ">" (truncate) rather than ">>" (append), so the
# command never adds to an existing file and cannot duplicate junctions.
rule concatenate_sjdb:
    input:
        expand(WORKING_DIR + "star1pass/{sample}_SJ.out.tab",sample=config["samples"].keys())
    output:
        WORKING_DIR + "star1pass/SJ.concatenated.out.tab"
    message:"concatenating splice junctions from different samples "
    shell:
        "cat {input} > {output}"

# STAR 1st pass: align the trimmed read pairs of one sample against the plain
# genome index to collect its splice junctions. The SAM alignment is declared
# temp(), so only the junction table is kept.
# The read inputs are named fwd/rev instead of forward/reverse because
# "reverse" collides with the list method of the same name, which newer
# Snakemake releases reject as a reserved name.
rule star1pass_align:
    input:
        fwd = WORKING_DIR + "trimmed/{sample}_forward.fastq",
        rev = WORKING_DIR + "trimmed/{sample}_reverse.fastq",
        # index directory produced by star_genome_index
        ref = WORKING_DIR + "star_index/"
    output:
        WORKING_DIR + "star1pass/{sample}_SJ.out.tab",
        temp(WORKING_DIR + "star1pass/{sample}_Aligned.out.sam")
    message:"STAR 1st pass: aligning {wildcards.sample} reads to generate splice junction files"
    params:
        # prefix prepended by STAR to every file it writes for this sample
        WORKING_DIR + "star1pass/{sample}_"
    shell:
        "STAR --runMode alignReads "
        "--genomeDir {input.ref} "
        "--readFilesIn {input.fwd} {input.rev} "
        "--outFileNamePrefix {params} "
        "--outFilterIntronMotifs RemoveNoncanonical "
        "--runThreadN {THREADS}"

# sjdbOverhang specifies the length of the genomic sequence around the annotated junction to be used in constructing the splice junctions database. Ideally this length should be equal to ReadLength-1.
# Build the initial STAR genome index (used by the 1st-pass alignment) from
# the genome FASTA and the gene annotation GTF.
# NOTE(review): the output is a directory path that is not wrapped in
# directory(); recent Snakemake releases may require that flag -- confirm
# against the Snakemake version this pipeline targets.
rule star_genome_index:
    message:"generation STAR genome index"
    input:
        genome = config["refseqs"]["genomefasta"],
        gtf = config["annotations"]["gtf4genes"]
    output:
        WORKING_DIR + "star_index/"
    params:
        # same path as the output, used as --genomeDir
        WORKING_DIR + "star_index/"
    shell:
        # STAR writes into the index directory, so create it first
        "mkdir -p {params};"
        "STAR --runMode genomeGenerate "
        "--genomeDir {params} "
        "--genomeFastaFiles {input.genome} "
        "--runThreadN {THREADS} "
        "--sjdbOverhang 99 "
        "--sjdbGTFfile {input.gtf}"

################################
## Fastqc reports after trimming
################################
# Run FastQC on one trimmed FASTQ file. The rule is instantiated for every
# sample and for both mates through the {pair} wildcard, so the message
# reports the actual mate instead of a fixed "reverse".
rule fastqc_after_trimming:
    input:
        WORKING_DIR + "trimmed/{sample}_{pair}.fastq"
    output:
        "fastqc/trimmed/{sample}/{pair}/{sample}_{pair}_fastqc.html"
    message:"generating fastqc report for trimmed {wildcards.pair} {wildcards.sample} reads"
    params:
        # directory of the declared report, passed to FastQC as --outdir
        "fastqc/trimmed/{sample}/{pair}/"
    shell:
        "fastqc --threads {THREADS} --outdir={params} {input}"
    
###########
## Trimming
###########
# Trim the paired-end reads of one sample with Trimmomatic (PE mode).
# Raw FASTQ names are looked up per sample in config["samples"] and prefixed
# with config["fastqdir"]; paired and unpaired survivors are written to the
# working directory and Trimmomatic's stderr goes to the rule log.
# Changes with respect to the previous version:
#   * inputs/outputs are named fwd/rev instead of forward/reverse because
#     "reverse" collides with the list method of the same name, which newer
#     Snakemake releases reject as a reserved name;
#   * the log is placed under RESULT_DIR like every other result, instead of
#     a hard-coded "results/" directory;
#   * the misspelled param names are corrected (config keys are untouched).
rule trimmomaticPaired:
    input:
        ADAPTERFILE,
        fwd = lambda wildcards: config["fastqdir"] + config["samples"][wildcards.sample]["forward"],
        rev = lambda wildcards: config["fastqdir"] + config["samples"][wildcards.sample]["reverse"]
    output:
        fwd = WORKING_DIR + "trimmed/{sample}_forward.fastq",
        rev = WORKING_DIR + "trimmed/{sample}_reverse.fastq",
        fwdUnpaired = WORKING_DIR + "trimmed/{sample}_forward_unpaired.fastq",
        revUnpaired = WORKING_DIR + "trimmed/{sample}_reverse_unpaired.fastq"
    message:"Trimming {wildcards.sample} using Trimmomatic"
    log:
        RESULT_DIR + "trimlogs/{sample}.trimlog"
    params:
        # values are converted to strings before being inserted in the command
        seedMisMatches =          str(config['trimmomatic']['seedMisMatches']),
        palindromeClipThreshold = str(config['trimmomatic']['palindromeClipTreshold']),
        simpleClipThreshold =     str(config['trimmomatic']['simpleClipThreshold']),
        LeadMinTrimQual =         str(config['trimmomatic']['LeadMinTrimQual']),
        TrailMinTrimQual =        str(config['trimmomatic']['TrailMinTrimQual']),
        windowSize =              str(config['trimmomatic']['windowSize']),
        avgMinQual =              str(config['trimmomatic']['avgMinQual']),
        minReadLen =              str(config['trimmomatic']['minReadLength']),
        phred =                   str(config["trimmomatic"]["phred"])
    shell:
        "java -jar {TRIMMOMATIC} PE {params.phred} -threads {THREADS} "
        "{input.fwd} {input.rev} "
        # Trimmomatic expects: fwd paired, fwd unpaired, rev paired, rev unpaired
        "{output.fwd} {output.fwdUnpaired} "
        "{output.rev} {output.revUnpaired} "
        "ILLUMINACLIP:{ADAPTERFILE}:{params.seedMisMatches}:{params.palindromeClipThreshold}:{params.simpleClipThreshold} "
        "LEADING:{params.LeadMinTrimQual} "
        "TRAILING:{params.TrailMinTrimQual} "
        "SLIDINGWINDOW:{params.windowSize}:{params.avgMinQual} "
        "MINLEN:{params.minReadLen} 2>{log}"


